# install.packages(c("tidyverse","readr","knitr","plotly","janitor","lubridate","rmarkdown","broom","ggpubr","kableExtra","DT"))
library(tidyverse)
library(readr)
library(knitr)
library(plotly)
library(janitor)
library(lubridate)
library(rmarkdown)
library(broom)
library(ggpubr)
library(kableExtra)
library(DT)
library(grid) # for unit()
# helpers (optional)
pretty_r2_table <- function(df, caption_text){
df %>%
kable("html", caption = caption_text) %>%
kable_styling(full_width = FALSE) %>%
row_spec(0, bold = TRUE)
}
pretty_cor_table <- function(x, y, var1_name = NULL, var2_name = NULL) {
cor_res <- cor.test(x, y)
tibble(
Variable1 = var1_name,
Variable2 = var2_name,
Correlation = round(cor_res$estimate, 3),
p.value = signif(cor_res$p.value, 3)
) %>%
kable("html", caption = paste0("Correlation: ", var1_name, " vs ", var2_name)) %>%
kable_styling(full_width = FALSE) %>%
row_spec(0, bold = TRUE)
}
set.seed(123)
opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE)
#set up data for use
#read data, skip empty 2024 column
data <- read_csv("EdStats_v01.csv", show_col_types=FALSE, col_types=cols(`2024`=col_skip()))
# filter for only USA data
data_clean <- data %>%
filter(grepl("USA", `Country code`))
#make sure no NA cols remain
if (length(colnames(data_clean)[colSums(is.na(data_clean)) == nrow(data_clean)]) == 0) {
message("No NA cols remain.")
} else {
message("NA columns remain.")
}
## No NA cols remain.
#save filtered dataset
write_csv(data_clean, "EdStats_USA.csv")
data_attend <- read_csv("EdStats_attend.csv", show_col_types = FALSE)
# Long -> add Type/Level -> filter -> make factors
data_long_attend <- data_attend %>%
pivot_longer(cols = `1975`:`2022`, names_to="Year", values_to="Value") %>%
mutate(
Year = as.numeric(Year),
Type = case_when(
str_detect(`Indicator name`, regex("attendance", ignore_case=TRUE)) ~ "Attendance",
str_detect(`Indicator name`, regex("enrolment|enrollment", ignore_case=TRUE)) ~ "Enrollment",
TRUE ~ "Other"
),
Level = case_when(
str_detect(`Indicator name`, regex("primary", ignore_case=TRUE)) ~ "Elementary School",
str_detect(`Indicator name`, regex("lower secondary", ignore_case=TRUE)) ~ "Middle School",
str_detect(`Indicator name`, regex("upper secondary", ignore_case=TRUE)) ~ "High School",
TRUE ~ "Other"
)
) %>%
filter(Type %in% c("Attendance","Enrollment"),
Level %in% c("Elementary School","Middle School","High School"),
!is.na(Value))
data_long_attend <- data_long_attend %>%
mutate(
Level = factor(Level, levels = c("Elementary School","Middle School","High School")),
Type = factor(Type, levels = c("Attendance","Enrollment"))
)
# Long -> WIDE so we have both variables in columns
data_wide <- data_long_attend %>%
select(Year, Level, Type, Value) %>%
pivot_wider(names_from = Type, values_from = Value) %>%
arrange(Level, Year) %>%
filter(!is.na(Attendance), !is.na(Enrollment))
# quick check
DT::datatable(head(data_wide), caption = "Preview of wide data (by Level × Year)")
# 1) Simple Overall Regression: Does Enrollment predict Attendance across all data?
m_overall <- lm(Attendance ~ Enrollment, data = data_wide)
DT::datatable(broom::tidy(m_overall),
caption = "Overall Model Coefficients (Attendance ~ Enrollment)")
# 2) Regressions by School Level (nest + glance)
glance_by_level <- data_wide %>%
tidyr::nest(data = -Level) %>%
mutate(model = purrr::map(data, ~ lm(Attendance ~ Enrollment, data = .x)),
stats = purrr::map(model, broom::glance)) %>%
tidyr::unnest(stats)
DT::datatable(glance_by_level %>% dplyr::select(Level, r.squared, adj.r.squared, p.value, nobs),
caption = "Model Quality Summary by School Level")
# 3) Multiple Regression with controls (Year + Level)
m_controls <- lm(Attendance ~ Enrollment + Year + Level, data = data_wide)
DT::datatable(broom::tidy(m_controls),
caption = "Multiple Regression with Year and Level Controls")
# First, create the text labels from our model summary table
panel_labels <- glance_by_level %>%
transmute(
Level,
label = paste0("R² = ", round(r.squared, 3), "\np = ", signif(p.value, 3))
)
# Determine the best position for the labels on each plot
label_pos <- data_wide %>%
group_by(Level) %>%
summarise(x = min(Enrollment, na.rm=T), y = max(Attendance, na.rm=T)) %>%
left_join(panel_labels, by = "Level")
p_reg <- ggplot(data_wide, aes(x = Enrollment, y = Attendance)) +
geom_point(aes(color = Level), alpha = 0.8, show.legend = FALSE) +
geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
facet_wrap(~Level, scales = "free") +
geom_label(data = label_pos, aes(x = x, y = y, label = label),
hjust = 0, vjust = 1, size = 3.5, label.padding = unit(0.3, "lines")) +
labs(
title = "Attendance vs. Enrollment with OLS Fits by School Level (USA)",
subtitle = "Labels show R-squared and model p-value for each level-specific regression",
x = "Enrollment Rate (%)", y = "Attendance Rate (%)"
) +
theme_minimal(base_size = 14) +
theme(strip.text = element_text(face = "bold"))
print(p_reg)

# Diagnostic Plots for the Main Overall Model
par(mfrow = c(2, 2))
plot(m_overall)

par(mfrow = c(1, 1))
# Interactive Plotly Version for Exploring Data Points
p_interactive <- ggplot(data_wide, aes(x = Enrollment, y = Attendance)) +
geom_point(aes(color = Level, text = paste("Year:", Year)), alpha = 0.8) +
facet_wrap(~Level, scales = "free") +
labs(
title = "Interactive Plot of Attendance vs. Enrollment",
x = "Enrollment Rate (%)", y = "Attendance Rate (%)"
) +
theme_minimal(base_size = 12)
# Convert the ggplot object to a Plotly object
ggplotly(p_interactive, tooltip = "text")